package com.datahole.suffixarray.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * Character normalisation and classification tables covering every UTF-16
 * code unit (U+0000 through U+FFFF).
 *
 * <p><b>Mapping.</b> One lookup table is shared by {@link #simp2trad(char)},
 * {@link #toEnPun(char)} and {@link #toArab(char)}. It starts out as the
 * identity mapping and is then overridden by the pairs found in the classpath
 * resource {@code charmap.txt}. According to the original author's notes that
 * resource is intended to map Chinese numerals to Arabic digits, traditional
 * Chinese to simplified Chinese and full-width characters to half-width ones;
 * the resource itself is not part of this file, so treat that as the intended
 * contract rather than a guarantee.
 *
 * <p><b>Classification.</b> Every code unit is assigned exactly one
 * {@link CharType}: punctuation, ASCII digit, extended digit symbol, ASCII
 * letter, Chinese, Japanese or other.
 *
 * <p>Both tables are filled once during class initialisation and are only
 * read afterwards.
 *
 * @author jzcheng
 */
public class CharMapper {
	/** Number of distinct {@code char} values; each table needs one slot per value. */
	private static final int TABLE_SIZE = Character.MAX_VALUE + 1;

	/** Name of the mapping resource, resolved relative to this class. */
	private static final String CHARMAP_RESOURCE = "charmap.txt";

	/**
	 * Inclusive code-point ranges classified as {@link CharType#DigitExt}.
	 * Written as numeric ranges so the values do not depend on the encoding
	 * the compiler uses to read this source file.
	 */
	private static final int[][] DIGIT_EXT_RANGES = {
			{ 0x2160, 0x216B }, // Roman numerals one to twelve
			{ 0x2170, 0x2179 }, // small Roman numerals one to ten
			{ 0x2460, 0x2469 }, // circled digits one to ten
			{ 0x2474, 0x2487 }, // parenthesized numbers one to twenty
			{ 0x2488, 0x249B }, // numbers one to twenty followed by a full stop
			{ 0x3220, 0x3229 }, // parenthesized ideographs one to ten
	};

	/** Ideographic (full-width) space. */
	private static final char FULL_WIDTH_SPACE = 0x3000;
	/** First full-width form that has an ASCII counterpart (full-width '!'). */
	private static final char FULL_WIDTH_FIRST = 0xFF01;
	/** Last full-width form that has an ASCII counterpart (full-width '~'). */
	private static final char FULL_WIDTH_LAST = 0xFF5E;
	/** Distance between a full-width form and its ASCII counterpart. */
	private static final int FULL_WIDTH_OFFSET = 0xFEE0;

	/** Category assigned to a character by {@link #getType(char)}. */
	public enum CharType {
		Other, Dot, Digit, DigitExt, English, Chinese, Japanese
	}

	private static final char[] charMapper = new char[TABLE_SIZE];
	private static final CharType[] charType = new CharType[TABLE_SIZE];

	static {
		fillCharType();
		fillCharMapper();
	}

	/**
	 * Classifies every code unit, then overrides the extended digit symbols,
	 * which the generic rules would otherwise classify differently.
	 */
	private static void fillCharType() {
		// The counter must be an int: a char can never reach TABLE_SIZE, so a
		// char counter would wrap around to 0 and loop forever.
		for (int code = 0; code < TABLE_SIZE; code++) {
			charType[code] = classify((char) code);
		}
		for (int[] range : DIGIT_EXT_RANGES) {
			for (int code = range[0]; code <= range[1]; code++) {
				charType[code] = CharType.DigitExt;
			}
		}
	}

	/**
	 * Applies the classification rules in priority order: punctuation first,
	 * then ASCII digit, ASCII letter, Chinese, Japanese, and finally other.
	 */
	private static CharType classify(char c) {
		if (isDot(c)) {
			return CharType.Dot;
		}
		if (isDigit(c)) {
			return CharType.Digit;
		}
		if (isEnglish(c)) {
			return CharType.English;
		}
		if (isChinese(c)) {
			return CharType.Chinese;
		}
		if (isJapanese(c)) {
			return CharType.Japanese;
		}
		return CharType.Other;
	}

	/**
	 * Initialises the mapping table to the identity and then applies the
	 * overrides from the mapping resource. A missing resource is reported and
	 * leaves the identity mapping in place instead of breaking class loading.
	 */
	private static void fillCharMapper() {
		for (int code = 0; code < TABLE_SIZE; code++) {
			charMapper[code] = (char) code;
		}
		InputStream in = CharMapper.class.getResourceAsStream(CHARMAP_RESOURCE);
		if (in == null) {
			warn("Resource " + CHARMAP_RESOURCE
					+ " not found; characters will map to themselves", null);
			return;
		}
		loadMapper(in, charMapper);
	}

	/**
	 * Reads mapping pairs from a UTF-8 text stream into {@code mapper}.
	 *
	 * <p>Each line that is exactly two {@code char}s long maps its first
	 * character to its second; every other line is ignored. Loading is best
	 * effort: if reading fails part-way, the pairs read so far stay applied
	 * and a warning is logged. The stream is always closed.
	 *
	 * @param is     stream to read from; must not be {@code null}
	 * @param mapper table indexed by source character, updated in place
	 */
	private static void loadMapper(InputStream is, char[] mapper) {
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
			String line;
			while ((line = reader.readLine()) != null) {
				if (line.length() == 2) {
					mapper[line.charAt(0)] = line.charAt(1);
				}
			}
		} catch (IOException e) {
			warn("Failed to read character mapping; table may be incomplete", e);
		} finally {
			try {
				if (reader != null) {
					reader.close();
				} else {
					is.close();
				}
			} catch (IOException ignored) {
				// Nothing useful can be done if closing fails.
			}
		}
	}

	/** Logs a warning through java.util.logging under this class's name. */
	private static void warn(String message, Throwable cause) {
		java.util.logging.Logger.getLogger(CharMapper.class.getName())
				.log(java.util.logging.Level.WARNING, message, cause);
	}

	/**
	 * Converts a full-width character to its half-width (ASCII) counterpart.
	 *
	 * @param c character to convert
	 * @return a plain space for the ideographic space U+3000, the ASCII
	 *         equivalent for the full-width forms U+FF01 to U+FF5E, and
	 *         {@code c} unchanged otherwise
	 */
	public static char toDBCCase(char c) {
		if (c == FULL_WIDTH_SPACE) {
			return ' ';
		}
		if (c >= FULL_WIDTH_FIRST && c <= FULL_WIDTH_LAST) {
			return (char) (c - FULL_WIDTH_OFFSET);
		}
		return c;
	}

	/**
	 * Returns the category of a character.
	 *
	 * @param c any {@code char} value, including U+FFFF
	 * @return the category assigned to {@code c}; never {@code null}
	 */
	public static CharType getType(char c) {
		return charType[c];
	}

	/**
	 * Looks {@code c} up in the shared mapping table.
	 *
	 * <p>NOTE(review): the method name says "simplified to traditional" while
	 * the original comment said "traditional to simplified". The actual
	 * direction is whatever the mapping resource defines; confirm against
	 * that resource before relying on either reading.
	 *
	 * @param c character to map
	 * @return the mapped character, or {@code c} itself when no mapping exists
	 */
	public static char simp2trad(char c) {
		return charMapper[c];
	}

	/**
	 * Looks {@code c} up in the shared mapping table; intended for turning
	 * Chinese punctuation into English punctuation. Performs the same lookup
	 * as {@link #simp2trad(char)} and {@link #toArab(char)}.
	 *
	 * @param c character to map
	 * @return the mapped character, or {@code c} itself when no mapping exists
	 */
	public static char toEnPun(char c) {
		return charMapper[c];
	}

	/**
	 * Lower-cases an ASCII upper-case letter.
	 *
	 * @param c character to convert
	 * @return the lower-case letter for 'A' to 'Z', otherwise {@code c}
	 */
	public static char toLowerCase(char c) {
		if (c >= 'A' && c <= 'Z') {
			return (char) (c + ('a' - 'A'));
		}
		return c;
	}

	/**
	 * Looks {@code c} up in the shared mapping table; intended for turning
	 * numerals into Arabic digits. Performs the same lookup as
	 * {@link #simp2trad(char)} and {@link #toEnPun(char)}.
	 *
	 * @param c character to map
	 * @return the mapped character, or {@code c} itself when no mapping exists
	 */
	public static char toArab(char c) {
		return charMapper[c];
	}

	/**
	 * Tells whether a character counts as punctuation.
	 *
	 * <p>Everything in the "CJK Symbols and Punctuation" and "Spacing
	 * Modifier Letters" blocks qualifies, and so does any character that is
	 * neither a letter nor a digit (this includes whitespace and control
	 * characters).
	 *
	 * @param c character to test
	 * @return {@code true} if {@code c} is treated as punctuation
	 */
	public static boolean isDot(char c) {
		Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
		if (block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
				|| block == Character.UnicodeBlock.SPACING_MODIFIER_LETTERS) {
			return true;
		}
		return !Character.isLetterOrDigit(c);
	}

	/**
	 * Tells whether a character is an ASCII digit.
	 *
	 * @param c character to test
	 * @return {@code true} for '0' to '9' only
	 */
	public static boolean isDigit(char c) {
		return c >= '0' && c <= '9';
	}

	/**
	 * Tells whether a character is an ASCII letter.
	 *
	 * @param c character to test
	 * @return {@code true} for 'A' to 'Z' and 'a' to 'z' only
	 */
	public static boolean isEnglish(char c) {
		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
	}

	/**
	 * Tells whether a character lies in the Hiragana, Katakana or Bopomofo
	 * block.
	 *
	 * <p>NOTE(review): Bopomofo is a phonetic notation for Chinese, not
	 * Japanese; it is kept here because the original classification included
	 * it. Confirm whether that is intended.
	 *
	 * @param c character to test
	 * @return {@code true} if {@code c} belongs to one of those blocks
	 */
	private static boolean isJapanese(char c) {
		Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
		return block == Character.UnicodeBlock.HIRAGANA
				|| block == Character.UnicodeBlock.KATAKANA
				|| block == Character.UnicodeBlock.BOPOMOFO;
	}

	/**
	 * Tells whether a character is a letter or digit located in the
	 * "CJK Unified Ideographs" block.
	 *
	 * @param c character to test
	 * @return {@code true} if {@code c} is such an ideograph
	 */
	public static boolean isChinese(char c) {
		return Character.isLetterOrDigit(c)
				&& Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
	}
}
